#!/usr/bin/perl
#Programmer: Rory Carmichael
#Usage: group_stats.pl <folder of aligned orthology groups>

use strict;
use warnings;

# Run the report only when this file is executed as a script, so the subs
# below can also be loaded (e.g. from a test) without side effects.
exit main(@ARGV) unless caller;

# main($dir)
#
# Reads every regular, non-hidden file in $dir as an aligned FASTA
# orthology group and prints three tab-separated histograms to the
# currently selected output handle:
#   * number of distinct organisms per cluster
#   * number of sequences per cluster
#   * mean number of gap ('-') characters per sequence line per cluster
#
# Returns 0 on success, 2 when no directory was supplied.
# Dies if the directory or one of its files cannot be opened.
sub main {
    my ($dir) = @_;

    if (!defined $dir || $dir eq '') {
        print STDERR
            "Usage: group_stats.pl <folder of aligned orthology groups>\n";
        return 2;
    }
    chomp $dir;

    # Read the directory directly instead of shelling out to `ls`, so that
    # names containing spaces or shell metacharacters are handled safely.
    # Dot-files are skipped to match what plain `ls` would have listed.
    opendir(my $dh, $dir) or die "Cannot open directory '$dir': $!\n";
    my @files = sort grep { !/^\./ && -f "$dir/$_" } readdir $dh;
    closedir $dh;

    my (%org_hash, %seq_hash, %dash_hash);
    for my $file (@files) {
        my ($seq_cnt, $org_cnt, $mean_dash) = _group_stats("$dir/$file");
        $seq_hash{$seq_cnt}++;
        $org_hash{$org_cnt}++;
        $dash_hash{$mean_dash}++;
    }

    _print_histogram('HISTOGRAM OF ORGANISMS PER CLUSTER', \%org_hash);
    _print_histogram('HISTOGRAM OF SEQUENCES PER CLUSTER', \%seq_hash);
    _print_histogram('HISTOGRAM OF MEAN GAPS PER CLUSTER', \%dash_hash);

    return 0;
}

# _group_stats($path)
#
# Scans one alignment file in a single pass and returns the list
# (sequence count, distinct organism count, mean gaps per sequence line).
# A line containing '>' is treated as a header; its organism is the text
# before the first '|'. Every other line counts as a sequence line.
sub _group_stats {
    my ($path) = @_;

    open(my $fh, '<', $path) or die "Cannot read '$path': $!\n";

    my %orgs;
    my $seq_cnt    = 0;
    my $line_cnt   = 0;
    my $dash_total = 0;

    while (my $line = <$fh>) {
        chomp $line;
        if ($line =~ />/) {
            $seq_cnt++;
            my ($org) = split /\|/, $line;
            $orgs{$org} = 1;
        }
        else {
            $line_cnt++;
            # tr/-// with an empty replacement list counts the gap
            # characters without modifying the line.
            $dash_total += ($line =~ tr/-//);
        }
    }
    close $fh;

    # A file without any sequence lines has no gaps; report 0 rather than
    # dividing by zero.
    my $mean_dash = $line_cnt ? $dash_total / $line_cnt : 0;

    return ($seq_cnt, scalar(keys %orgs), $mean_dash);
}

# _print_histogram($title, \%counts)
#
# Prints $title followed by one "key<TAB>count" line per bucket, with the
# buckets in ascending numeric order (a plain sort would put 10 before 2).
sub _print_histogram {
    my ($title, $counts) = @_;

    print "$title\n";
    for my $key (sort { $a <=> $b } keys %$counts) {
        print "$key\t" . $counts->{$key} . "\n";
    }
    return;
}
